<?php

namespace Iwester\Services;

use App;
use Carbon\Carbon;
use Illuminate\Support\Facades\Storage;
use Iwester\Http\Model\Spider\SpiderArticleContent;
use Iwester\Http\Model\Spider\SpiderTask;
use Log;
use QL\QueryList;

/**
 * Crawler service: picks a due SpiderTask, walks its list pages, fetches each
 * detail page and stores the extracted article through SpiderArticleContent.
 *
 * The static helpers (getHtml, formatPageParam, testListQuery,
 * articleDetailQuery, ruleMatchData, formatUrlDomain) are public so they can
 * be called without an instance.
 */
class SpiderService
{
    public function __construct()
    {
    }

    /**
     * Scheduled crawl entry point.
     *
     * Selects the highest-priority enabled task whose next_spider_time has
     * passed, reschedules it (when it has a frequency) and crawls it.
     * Exceptions are logged and swallowed so the caller never sees them.
     */
    public function spider()
    {
        try {
            set_time_limit(0);
            // ini_set() expects a string value.
            ini_set('memory_limit', '-1');

            $task = SpiderTask::with('listConfig')
                ->where('status', 1)
                ->where('next_spider_time', '<', Carbon::now())
                ->orderBy('priority', 'desc')
                ->first();
            if (!$task) {
                \Log::info('暂无任务');
                return;
            }

            // Reschedule before crawling so the task is not picked up again
            // by the next scheduler tick while this run is still in progress.
            if ($task->freq > 0) {
                $task->cur_spider_time = Carbon::now();
                $task->next_spider_time = Carbon::now()->addHour(SpiderTask::$freqHours[$task->freq]);
                $task->save();
            }

            \Log::info('开始任务 ID= '. $task->id);
            $this->spiderItem($task);
        } catch (\Exception $e) {
            \Log::error($e->getMessage());
        }
    }

    /**
     * Crawl a single task: determine the number of list pages, then visit
     * every list page and every not-yet-stored detail URL found on it.
     *
     * @param mixed $task task model exposing listConfig, cookie and id
     */
    public function spiderItem($task)
    {
        try {
            $listConfig = $task->listConfig;
            // Validate before decoding; a task without list config cannot be crawled.
            if (!$listConfig || $listConfig->list_url == '' || $listConfig->list_url_content == '') {
                \Log::info('缺少参数');
                return;
            }
            $list_param = json_decode($listConfig->list_url_content, true);

            $listPageUrl = $listConfig->list_url;
            $firstPageUrl = str_replace(SpiderTask::PAGE, 1, $listPageUrl);
            $html = self::getHtml($firstPageUrl, $task->cookie);

            // 1: work out how many list pages exist.
            $list_page_param = [
                'rule' => $listConfig->list_page_matche,
                'type' => $listConfig->list_page_attr,
            ];
            // formatPageParam() may return arbitrary extracted text; normalise it
            // so the loop bound below is always an integer.
            $maxPage = self::normalizePageCount(self::formatPageParam($html, $list_page_param));
            \Log::info('分析最大页 = '.$maxPage);

            // Loop-invariant extraction parameters.
            $listParams = [
                'list_url' => $listConfig->list_url,
                'list_param' => $list_param,
            ];

            // 2: iterate over the list pages.
            for ($p = 1; $p <= $maxPage; $p++) {
                $curListPageUrl = str_replace(SpiderTask::PAGE, $p, $listPageUrl);
                // Page 1 was already downloaded for page-count detection; reuse it.
                $curListHtml = ($p === 1 && $html !== '')
                    ? $html
                    : self::getHtml($curListPageUrl, $task->cookie);

                $listDatas = self::testListQuery($curListHtml, $listParams);
                \Log::info('开始第'.$p.'页 url= '.$curListPageUrl. ' - 当前页数据量 = '.count($listDatas));
                if (count($listDatas) === 0) {
                    continue;
                }

                foreach ($listDatas as $data) {
                    if (!isset($data['url'])) {
                        \Log::info('开始第'.$p.'页 没有url -> '.json_encode($data, JSON_UNESCAPED_UNICODE));
                        continue;
                    }

                    $detailUrl = $data['url'];
                    $alreadyStored = SpiderArticleContent::where('task_id', $task->id)
                        ->where('detail_url', $detailUrl)
                        ->exists();
                    if ($alreadyStored) {
                        // Stop processing the rest of this page at the first
                        // already-stored URL. NOTE(review): later pages are
                        // still visited — confirm this is intended.
                        break;
                    }

                    \Log::info('开始第'.$p.'页详情页 url= '.$detailUrl);
                    $this->spiderDetail($data, $detailUrl, $task, $listConfig);
                    sleep(1); // throttle between detail requests
                }
                sleep(1); // throttle between list pages
            }
        } catch (\Exception $e) {
            \Log::error($e->getMessage());
        }
    }

    /**
     * Fetch one detail page and persist the extracted article.
     *
     * Nothing is stored when extraction yields no usable content.
     *
     * @param array  $listData   fields extracted from the list page for this item
     * @param string $detailUrl  URL of the detail page
     * @param mixed  $task       task model (id and cookie are read)
     * @param mixed  $listConfig list/detail configuration model of the task
     */
    public function spiderDetail($listData, $detailUrl, $task, $listConfig)
    {
        // Build the parameter array without writing extra attributes onto the
        // model, which is shared across every article of the task.
        $detailParams = $listConfig->toArray();
        $detailParams['page_param'] = [
            'rule' => $listConfig->detail_page_matche,
            'type' => $listConfig->detail_page_attr,
        ];
        $detailParams['data_param'] = json_decode((string) $listConfig->detail_url_content, true);

        $result = self::detail($detailUrl, $detailParams, $task->cookie);
        // detail() returns [] for empty content and ['code' => 201, ...] when
        // nothing could be extracted; neither must produce a stored row.
        if (!$result || isset($result['code'])) {
            return;
        }

        // Category from the list page wins over the one from the detail page.
        if (isset($listData['category'])) {
            $article_category = $listData['category'];
        } elseif (isset($result['category'])) {
            $article_category = $result['category'];
        } else {
            $article_category = '';
        }

        $detail = [
            'task_id' => $task->id,
            'task_config_id' => $listConfig->id,
            'list_url' => $listData['url'] ?? '',
            'detail_url' => $detailUrl,
            'content' => $result['content'] ?? '',
            'title' => $result['title'] ?? '',
            'table_template' => $result['table_template'] ?? '',
            'published_at' => $result['published_at'] ?? date('Y-m-d'),
            'article_category' => $article_category,
            'cover' => $listData['cover'] ?? '',
        ];
        $con = SpiderArticleContent::create($detail);
        \Log::info('数据存储详情页内容 url= '.$detailUrl. ' - contentID='.$con->id);
    }

    /**
     * Download and extract a detail page, merging paginated content.
     *
     * @param string $pageUrl      URL of the (first) detail page
     * @param array  $detailParams config array; reads detail_use_page, page_param,
     *                             data_param, fenye_url and table_template
     * @param string $cookie       cookie passed through to getHtml()
     * @return array extracted article (title, category, published_at, content,
     *               table_template); [] when the content is empty; or
     *               ['code' => 201, 'message' => ...] when the non-paginated
     *               page yields no data rows
     */
    public static function detail($pageUrl, $detailParams, $cookie)
    {
        $content = '';
        $data = [];
        $purifier = ['HTML.Allowed' => 'div,p,img[width|src]', 'CSS.AllowedProperties' => ''];
        $dataParam = $detailParams['data_param'] ?? [];

        // 1: decide whether the article is split over several pages.
        if (($detailParams['detail_use_page'] ?? 0) == 1) {
            $html = self::getHtml($pageUrl, $cookie);
            $pageParam = $detailParams['page_param'] ?? ['rule' => '', 'type' => ''];
            $maxPage = self::normalizePageCount(self::formatPageParam($html, $pageParam));

            $firstPage = self::articleDetailQuery($html, $dataParam);
            if (count($firstPage) > 0) {
                $data = $firstPage[0];
                $content .= $data['content'] ?? '';
            }

            // Pages 2..N are requested separately and their content appended.
            $fenyeUrl = $detailParams['fenye_url'] ?? '';
            if ($fenyeUrl != '') {
                for ($p = 2; $p <= $maxPage; $p++) {
                    $nextUrl = str_replace(SpiderTask::PAGE, $p, $fenyeUrl);
                    $fenyeContent = self::articleDetailQuery(self::getHtml($nextUrl, $cookie), $dataParam);
                    if (count($fenyeContent) > 0 && isset($fenyeContent[0]['content'])) {
                        $content .= $fenyeContent[0]['content'];
                    }
                }
            }
        } else {
            $datas = self::articleDetailQuery(self::getHtml($pageUrl, $cookie), $dataParam);
            if (count($datas) == 0) {
                return ['code' => 201, 'message' => '数据为空'];
            }
            $data = $datas[0];
            $content = $data['content'] ?? '';
        }

        if ($content == '') {
            return [];
        }

        $formatContent = FunctionService::formatHtml($content, $purifier);
        // Fall back to today when the extracted date is missing or unparsable.
        if (isset($data['published_at']) && strtotime($data['published_at'])) {
            $published_at = $data['published_at'];
        } else {
            $published_at = date('Y-m-d');
        }

        return [
            'title' => $data['title'] ?? '',
            'category' => $data['category'] ?? '',
            'published_at' => $published_at,
            'content' => $formatContent,
            'table_template' => $detailParams['table_template'] ?? 'article',
        ];
    }

    /**
     * Download a page and return its HTML as UTF-8.
     *
     * @param string $url
     * @param string $cookie
     * @param bool   $is_roxy passed through to FunctionService::setHeader()
     * @return string HTML, or '' when the request throws
     */
    public static function getHtml($url, $cookie = '', $is_roxy = false)
    {
        try {
            $headers = FunctionService::setHeader($url, $cookie, $is_roxy);
            $html = QueryList::get($url, [], $headers)->removeHead()->getHtml();
            if (!is_string($html) || $html === '') {
                return '';
            }

            // Valid UTF-8 (including plain ASCII) needs no conversion. Checking
            // this first avoids UTF-8 pages being misdetected as GB2312/GBK,
            // since those come first in the detection list below.
            if (mb_check_encoding($html, 'UTF-8')) {
                return $html;
            }

            $encode = mb_detect_encoding($html, ['ASCII', 'GB2312', 'GBK', 'UTF-8', 'BIG5']);
            if ($encode === false) {
                // Unknown encoding: return the bytes untouched rather than
                // passing false to mb_convert_encoding().
                return $html;
            }
            return mb_convert_encoding($html, 'UTF-8', $encode);
        } catch (\Exception $e) {
            return '';
        }
    }

    /**
     * Determine the maximum page number from a page.
     *
     * A numeric rule is returned as-is (fixed page count). A rule containing
     * "$" is treated as an element selector for QueryList; anything else is
     * used as a case-insensitive regex whose first capture group is the page
     * count.
     *
     * @param string $html
     * @param array  $pageRules ['rule' => string, 'type' => string]
     * @return int|string page count as extracted (may be a string); 1 when
     *                    nothing is found
     */
    public static function formatPageParam($html, $pageRules)
    {
        $rule = $pageRules['rule'] ?? '';

        // A numeric rule is a fixed page count.
        if (is_numeric($rule)) {
            return $rule;
        }
        if ($rule == '') {
            return 1;
        }

        if (strpos($rule, '$') !== false) {
            // Contains "$": element selector.
            $pageSizeRules = [
                'maxPage' => [$rule, $pageRules['type']],
            ];
            $first = QueryList::html($html)->rules($pageSizeRules)->query()->getData()->first();
            return $first ? $first['maxPage'] : 1;
        }

        // No "$": regular expression.
        $pattern = '/' . str_replace('/', '\/', $rule) . '/i';
        if (preg_match($pattern, $html, $match) && $match) {
            return isset($match[1]) ? $match[1] : 1;
        }
        return 1;
    }

    /**
     * Extract the rows of a list page.
     *
     * @param string $html
     * @param array  $listParams ['list_url' => string, 'list_param' => array of
     *                           field configs with key/rule/type/rule_match;
     *                           the first entry also carries url_prefix]
     * @return array extracted rows; [] on any exception
     */
    public static function testListQuery($html, $listParams)
    {
        try {
            $fields = $listParams['list_param'] ?? [];
            if (!is_array($fields)) {
                $fields = [];
            }

            $formatUrlDomain = self::formatUrlDomain($listParams['list_url']);
            // url_prefix selects what is prepended to extracted URLs:
            // 1 = scheme://host, 2 = list URL up to its last "/", else nothing.
            $urlPrefix = $fields[0]['url_prefix'] ?? 0;
            if ($urlPrefix == 1) {
                $preUrl = $formatUrlDomain['host'];
            } elseif ($urlPrefix == 2) {
                $preUrl = $formatUrlDomain['p_url'];
            } else {
                $preUrl = '';
            }

            $listRuleMatchs = [];
            $listRule = [];
            foreach ($fields as $field) {
                if (($field['rule'] ?? '') != '') {
                    $listRule[$field['key']] = [$field['rule'], $field['type']];
                    $listRuleMatchs[$field['key']] = $field['rule_match'] ?? '';
                }
            }

            $listDatas = QueryList::html($html)->rules($listRule)->query()->getData()->toArray();
            // Post-process rows: (1) prefix the URL, (2) apply per-field regex trimming.
            foreach ($listDatas as $k => $data) {
                if (!isset($data['url'])) {
                    continue;
                }
                $listDatas[$k]['url'] = $preUrl . $data['url'];
                foreach ($listRuleMatchs as $curKey => $listRuleMatch) {
                    // NOTE(review): a rule_match on "url" overwrites the
                    // prefixed URL with the match on the raw value — confirm.
                    if ($listRuleMatch && isset($data[$curKey])) {
                        $listDatas[$k][$curKey] = self::ruleMatchData($listRuleMatch, $data[$curKey]);
                    }
                }
            }
            return $listDatas;
        } catch (\Exception $e) {
            return [];
        }
    }

    /**
     * Extract the fields of a detail page.
     *
     * @param string     $html
     * @param array|null $dataParams field configs with key/rule/type/rule_match
     * @return array extracted rows (regex trimming is applied only to rows
     *               that contain a "title" field)
     */
    public static function articleDetailQuery($html, $dataParams)
    {
        $rules = [];
        $listRuleMatchs = [];
        // A failed json_decode upstream yields null; treat it as "no fields".
        foreach ((is_array($dataParams) ? $dataParams : []) as $param) {
            if (($param['rule'] ?? '') != '') {
                $rules[$param['key']] = [$param['rule'], $param['type']];
                $listRuleMatchs[$param['key']] = $param['rule_match'] ?? '';
            }
        }

        $detailDatas = QueryList::html($html)->rules($rules)->query()->getData()->toArray();
        foreach ($detailDatas as $k => $data) {
            if (!isset($data['title'])) {
                continue;
            }
            foreach ($listRuleMatchs as $curKey => $listRuleMatch) {
                if ($listRuleMatch && isset($data[$curKey])) {
                    $detailDatas[$k][$curKey] = self::ruleMatchData($listRuleMatch, $data[$curKey]);
                }
            }
        }
        return $detailDatas;
    }

    /**
     * Trim a value with a regex: returns the first match of the pattern's last
     * capture group, or the original content when the pattern has no capture
     * group or does not match.
     *
     * "/" and "|" in the rule are escaped, so "|" is matched literally.
     * Line breaks are removed from the content before matching.
     *
     * @param string $rule    regex body without delimiters
     * @param string $content
     * @return string
     */
    public static function ruleMatchData($rule, $content)
    {
        try {
            $rule = str_replace('/', '\/', $rule);
            $rule = str_replace('|', '\|', $rule);
            $con = preg_replace('/\r?\n/', '', $content);
            preg_match_all("/$rule/", $con, $matches);

            // $matches[1] exists whenever the pattern has a capture group, even
            // with zero matches, so check the actual element being returned.
            $last = count($matches) - 1;
            if ($last >= 1 && isset($matches[$last][0])) {
                return $matches[$last][0];
            }
            return $content;
        } catch (\Exception $e) {
            return $content;
        }
    }

    /**
     * Split a list URL into the prefixes used to absolutise extracted links.
     *
     * @param string $pageUrl
     * @return array ['host' => scheme://host[:port],
     *                'p_url' => the URL up to its last "/" when it contains
     *                           "[PAGE]", otherwise the URL itself];
     *               both entries are the input when it has no scheme or host
     */
    public static function formatUrlDomain($pageUrl)
    {
        $parts = parse_url((string) $pageUrl);
        if (!is_array($parts) || !isset($parts['scheme'], $parts['host'])) {
            return ['host' => $pageUrl, 'p_url' => $pageUrl];
        }

        $host = $parts['scheme'] . '://' . $parts['host'];
        if (isset($parts['port'])) {
            // Keep a non-default port, otherwise prefixed links hit the wrong server.
            $host .= ':' . $parts['port'];
        }

        if (strpos($pageUrl, '[PAGE]') === false) {
            $p_url = $pageUrl;
        } else {
            $p_url = substr($pageUrl, 0, strrpos($pageUrl, '/'));
        }
        return ['host' => $host, 'p_url' => $p_url];
    }

    /**
     * Turn the value returned by formatPageParam() into a safe loop bound:
     * numeric values are cast to int, anything else counts as a single page.
     *
     * @param mixed $value
     * @return int
     */
    private static function normalizePageCount($value)
    {
        if (is_string($value)) {
            $value = trim($value);
        }
        return is_numeric($value) ? (int) $value : 1;
    }
}